from pyspark import SparkConf, SparkContext

if __name__ == '__main__':
    # Demo of RDD.union: concatenating two RDDs into one.

    # Build the SparkConf: application name "test", run locally using all cores.
    conf = SparkConf().setAppName("test").setMaster("local[*]")
    # Build the SparkContext, the entry point to the Spark execution environment.
    sc = SparkContext(conf=conf)

    try:
        numbers = sc.parallelize([1, 1, 2, 3, 4, 4])
        letters = sc.parallelize(["a", "a", "b", "b"])

        # union: joins two RDDs into one.
        #   1. union does NOT de-duplicate the records (duplicates are kept).
        #   2. union can join two RDDs whose elements are of different types.
        combined = numbers.union(letters)

        print(combined.collect())
    finally:
        # Always release the SparkContext, even if the job above fails;
        # otherwise the context is left running until interpreter exit.
        sc.stop()